We will use a case study approach in this class to understand the steps that come before building machine learning models, ensuring the data is robust enough for making predictions.
The problem statement comes from an online education platform, where we'll look at factors that help us select the most promising leads, i.e. the leads that are most likely to convert into paying customers.
Our ultimate goal: use the data from previous leads, both those who converted to customers and the many who did not, to build a model we can use to score incoming leads for preferential retargeting.
The data dictionary for the data set is here
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('https://raw.githubusercontent.com/sdhar-pycourse/py-datascience-biz/main/practise-datasets/Leads.csv', index_col=0)  # first column (Prospect ID) becomes the index
df.shape
(9240, 36)
for i, c in enumerate(df.columns):
    print(i, c)
0 Lead Number
1 Lead Origin
2 Lead Source
3 Do Not Email
4 Do Not Call
5 Converted
6 TotalVisits
7 Total Time Spent on Website
8 Page Views Per Visit
9 Last Activity
10 Country
11 Specialization
12 How did you hear about X Education
13 What is your current occupation
14 What matters most to you in choosing a course
15 Search
16 Magazine
17 Newspaper Article
18 X Education Forums
19 Newspaper
20 Digital Advertisement
21 Through Recommendations
22 Receive More Updates About Our Courses
23 Tags
24 Lead Quality
25 Update me on Supply Chain Content
26 Get updates on DM Content
27 Lead Profile
28 City
29 Asymmetrique Activity Index
30 Asymmetrique Profile Index
31 Asymmetrique Activity Score
32 Asymmetrique Profile Score
33 I agree to pay the amount through cheque
34 A free copy of Mastering The Interview
35 Last Notable Activity
df.columns
Index(['Lead Number', 'Lead Origin', 'Lead Source', 'Do Not Email', 'Do Not Call', 'Converted', 'TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit', 'Last Activity', 'Country', 'Specialization', 'How did you hear about X Education', 'What is your current occupation', 'What matters most to you in choosing a course', 'Search', 'Magazine', 'Newspaper Article', 'X Education Forums', 'Newspaper', 'Digital Advertisement', 'Through Recommendations', 'Receive More Updates About Our Courses', 'Tags', 'Lead Quality', 'Update me on Supply Chain Content', 'Get updates on DM Content', 'Lead Profile', 'City', 'Asymmetrique Activity Index', 'Asymmetrique Profile Index', 'Asymmetrique Activity Score', 'Asymmetrique Profile Score', 'I agree to pay the amount through cheque', 'A free copy of Mastering The Interview', 'Last Notable Activity'], dtype='object')
# short, code-friendly handles for the 36 columns, in the order of df.columns
handle = ['prospsid', 'leadnum', 'origin', 'dne', 'dnc', 'target', 'totviz', 'tos', 'pageviz', 'lastact', 'country', 'splz',
'leadsrc', 'curoccp', 'reason', 'srch', 'mag', 'newsart', 'edforum', 'newsad', 'digitalad', 'reco', 'updates', 'tag', 'leadquality', 'supchncont',
'dmcontent', 'leadprofile', 'city', 'asymactidx', 'asymprofidx', 'asymactscr', 'asymprofscr', 'chkpay', 'freecp', 'lastnotact']
column_map= dict(zip(df.columns, handle))
column_map
{'A free copy of Mastering The Interview': 'freecp', 'Asymmetrique Activity Index': 'asymactidx', 'Asymmetrique Activity Score': 'asymactscr', 'Asymmetrique Profile Index': 'asymprofidx', 'Asymmetrique Profile Score': 'asymprofscr', 'City': 'city', 'Converted': 'target', 'Country': 'country', 'Digital Advertisement': 'digitalad', 'Do Not Call': 'dnc', 'Do Not Email': 'dne', 'Get updates on DM Content': 'dmcontent', 'How did you hear about X Education': 'leadsrc', 'I agree to pay the amount through cheque': 'chkpay', 'Last Activity': 'lastact', 'Last Notable Activity': 'lastnotact', 'Lead Number': 'prospsid', 'Lead Origin': 'leadnum', 'Lead Profile': 'leadprofile', 'Lead Quality': 'leadquality', 'Lead Source': 'origin', 'Magazine': 'mag', 'Newspaper': 'newsad', 'Newspaper Article': 'newsart', 'Page Views Per Visit': 'pageviz', 'Receive More Updates About Our Courses': 'updates', 'Search': 'srch', 'Specialization': 'splz', 'Tags': 'tag', 'Through Recommendations': 'reco', 'Total Time Spent on Website': 'tos', 'TotalVisits': 'totviz', 'Update me on Supply Chain Content': 'supchncont', 'What is your current occupation': 'curoccp', 'What matters most to you in choosing a course': 'reason', 'X Education Forums': 'edforum'}
df.rename(columns=column_map, inplace=True)
df.columns
Index(['prospsid', 'leadnum', 'origin', 'dne', 'dnc', 'target', 'totviz', 'tos', 'pageviz', 'lastact', 'country', 'splz', 'leadsrc', 'curoccp', 'reason', 'srch', 'mag', 'newsart', 'edforum', 'newsad', 'digitalad', 'reco', 'updates', 'tag', 'leadquality', 'supchncont', 'dmcontent', 'leadprofile', 'city', 'asymactidx', 'asymprofidx', 'asymactscr', 'asymprofscr', 'chkpay', 'freecp', 'lastnotact'], dtype='object')
Start with the target variable, target (originally Converted): check it for nulls and plot its distribution to see what kind of variable it is.
df.target.isnull().sum()
0
df.target.hist(bins=20, figsize=(6,6), color= 'orange', legend=True)
<matplotlib.axes._subplots.AxesSubplot at 0x7f523e720910>
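Since target is a 0/1 flag, a normalized value count is often easier to read than a histogram. A quick check (a sketch on the same df as above):

# the mean of a 0/1 column is the overall conversion rate
df.target.value_counts(normalize=True)
df.target.mean()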
cat_vars = df.columns[df.dtypes == 'object'].tolist()  # object-dtype columns are the categoricals
cat_vars
['leadnum', 'origin', 'dne', 'dnc', 'lastact', 'country', 'splz', 'leadsrc', 'curoccp', 'reason', 'srch', 'mag', 'newsart', 'edforum', 'newsad', 'digitalad', 'reco', 'updates', 'tag', 'leadquality', 'supchncont', 'dmcontent', 'leadprofile', 'city', 'asymactidx', 'asymprofidx', 'chkpay', 'freecp', 'lastnotact']
num_vars = df.columns[(df.dtypes != 'object')].tolist()
num_vars.pop(1)  # drop the binary target (position 1) from the numeric list
'target'
num_vars
['prospsid', 'totviz', 'tos', 'pageviz', 'asymactscr', 'asymprofscr']
df.loc[:,cat_vars].describe().T
| | count | unique | top | freq |
|---|---|---|---|---|
| leadnum | 9240 | 5 | Landing Page Submission | 4886 |
| origin | 9204 | 21 | Google | 2868 |
| dne | 9240 | 2 | No | 8506 |
| dnc | 9240 | 2 | No | 9238 |
| lastact | 9137 | 17 | Email Opened | 3437 |
| country | 6779 | 38 | India | 6492 |
| splz | 7802 | 19 | Select | 1942 |
| leadsrc | 7033 | 10 | Select | 5043 |
| curoccp | 6550 | 6 | Unemployed | 5600 |
| reason | 6531 | 3 | Better Career Prospects | 6528 |
| srch | 9240 | 2 | No | 9226 |
| mag | 9240 | 1 | No | 9240 |
| newsart | 9240 | 2 | No | 9238 |
| edforum | 9240 | 2 | No | 9239 |
| newsad | 9240 | 2 | No | 9239 |
| digitalad | 9240 | 2 | No | 9236 |
| reco | 9240 | 2 | No | 9233 |
| updates | 9240 | 1 | No | 9240 |
| tag | 5887 | 26 | Will revert after reading the email | 2072 |
| leadquality | 4473 | 5 | Might be | 1560 |
| supchncont | 9240 | 1 | No | 9240 |
| dmcontent | 9240 | 1 | No | 9240 |
| leadprofile | 6531 | 6 | Select | 4146 |
| city | 7820 | 7 | Mumbai | 3222 |
| asymactidx | 5022 | 3 | 02.Medium | 3839 |
| asymprofidx | 5022 | 3 | 02.Medium | 2788 |
| chkpay | 9240 | 1 | No | 9240 |
| freecp | 9240 | 2 | No | 6352 |
| lastnotact | 9240 | 16 | Modified | 3407 |
df.loc[:,num_vars].describe()
| | prospsid | totviz | tos | pageviz | asymactscr | asymprofscr |
|---|---|---|---|---|---|---|
| count | 9240.000000 | 9103.000000 | 9240.000000 | 9103.000000 | 5022.000000 | 5022.000000 |
| mean | 617188.435606 | 3.445238 | 487.698268 | 2.362820 | 14.306252 | 16.344883 |
| std | 23405.995698 | 4.854853 | 548.021466 | 2.161418 | 1.386694 | 1.811395 |
| min | 579533.000000 | 0.000000 | 0.000000 | 0.000000 | 7.000000 | 11.000000 |
| 25% | 596484.500000 | 1.000000 | 12.000000 | 1.000000 | 14.000000 | 15.000000 |
| 50% | 615479.000000 | 3.000000 | 248.000000 | 2.000000 | 14.000000 | 16.000000 |
| 75% | 637387.250000 | 5.000000 | 936.000000 | 3.000000 | 15.000000 | 18.000000 |
| max | 660737.000000 | 251.000000 | 2272.000000 | 55.000000 | 18.000000 | 20.000000 |
Ratio and interval variables are cleaned in much the same way.
Note, however, that some integer columns coded 1/0 are really nominal variables; a quick programmatic check follows.
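A minimal sketch of how such columns can be caught (the name flag_like is ours; it assumes df as loaded above):

# integer columns whose only values are 0 and 1 are flags (nominal),
# so they should not be summarized with means and percentiles
flag_like = [c for c in df.columns
             if df[c].dropna().nunique() == 2
             and set(df[c].dropna().unique()) <= {0, 1}]
flag_like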
totviz
The total number of visits made by the customer on the website.
df.totviz.hist(grid=True, bins=100, figsize=(7,7), color='red')
plt.xlabel('Total Visit')
Text(0.5, 0, 'Total Visit')
df.totviz.describe(percentiles=[.05,.25,.50,.75,.95])
count    9103.000000
mean        3.445238
std         4.854853
min         0.000000
5%          0.000000
25%         1.000000
50%         3.000000
75%         5.000000
95%        10.000000
max       251.000000
Name: totviz, dtype: float64
Null treatment of numeric values
Is a value missing because it wasn't recorded, or because it doesn't exist?
For a small % of rows with a missing value, we can impute the mean or median.
"Small" is a relative term: depending on the variability of the variable, it may range up to 30-40% of the data.
If the % is large, imputing a single value can distort the distribution and hurt predictive power, so the variable should be dropped instead; a minimal helper sketching this rule follows.
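A sketch of that decision rule (the function name and the 30% threshold are our assumptions, not a library API):

# hypothetical helper: impute a numeric column if its null share is small,
# otherwise report that it is a candidate for dropping
def impute_or_flag(frame, col, max_null_pct=30, strategy='median'):
    pct = 100 * frame[col].isnull().mean()
    if pct == 0:
        return f'{col}: no nulls'
    if pct <= max_null_pct:
        fill = frame[col].median() if strategy == 'median' else frame[col].mean()
        frame[col] = frame[col].fillna(fill)
        return f'{col}: filled with {strategy} ({pct:.1f}% null)'
    return f'{col}: {pct:.1f}% null -- consider dropping'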
df.totviz.describe()
count    9103.000000
mean        3.445238
std         4.854853
min         0.000000
25%         1.000000
50%         3.000000
75%         5.000000
max       251.000000
Name: totviz, dtype: float64
100*df.totviz.isnull().sum()/ len(df)
1.4826839826839826
df['totviz'] = df['totviz'].fillna(df['totviz'].median())  # median imputation; robust to the long right tail
100*df.totviz.isnull().sum()/ len(df)
0.0
df.totviz.describe()
count    9240.000000
mean        3.438636
std         4.819024
min         0.000000
25%         1.000000
50%         3.000000
75%         5.000000
max       251.000000
Name: totviz, dtype: float64
df1 = df.copy(deep=True)  # scratch copy to show what a blanket dropna would cost
df1.isnull().sum()
prospsid          0
leadnum           0
origin           36
dne               0
dnc               0
target            0
totviz            0
tos               0
pageviz         137
lastact         103
country        2461
splz           1438
leadsrc        2207
curoccp        2690
reason         2709
srch              0
mag               0
newsart           0
edforum           0
newsad            0
digitalad         0
reco              0
updates           0
tag            3353
leadquality    4767
supchncont        0
dmcontent         0
leadprofile    2709
city           1420
asymactidx     4218
asymprofidx    4218
asymactscr     4218
asymprofscr    4218
chkpay            0
freecp            0
lastnotact        0
dtype: int64
df1.dropna(inplace=True)
df1.isnull().sum()
prospsid       0
leadnum        0
origin         0
dne            0
dnc            0
target         0
totviz         0
tos            0
pageviz        0
lastact        0
country        0
splz           0
leadsrc        0
curoccp        0
reason         0
srch           0
mag            0
newsart        0
edforum        0
newsad         0
digitalad      0
reco           0
updates        0
tag            0
leadquality    0
supchncont     0
dmcontent      0
leadprofile    0
city           0
asymactidx     0
asymprofidx    0
asymactscr     0
asymprofscr    0
chkpay         0
freecp         0
lastnotact     0
dtype: int64
df1.shape
(1943, 36)
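Only 1943 of 9240 rows survive a blanket dropna; quantifying the loss makes the case for treating columns one by one:

# share of rows retained after dropping every row with any null -- about 21%
df1.shape[0] / df.shape[0]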
sns.catplot(data=df, x='totviz', kind= 'box', orient='h', aspect=3)
<seaborn.axisgrid.FacetGrid at 0x7f523e046d50>
Outlying values don't add significant information and can distort both summaries and models.
Whether to treat them depends on usage and practice: what is seen and expected in reality?
What you want to understand is the generally acceptable range of values; here we cap totviz at its 5th and 95th percentiles.
# 5th and 95th percentile cut-offs used for capping
p5 = df.totviz.describe(percentiles=[.05,.25,.50,.75,.95]).loc['5%']
p95 = df.totviz.describe(percentiles=[.05,.25,.50,.75,.95]).loc['95%']
df.totviz.describe(percentiles=[.05,.25,.50,.75,.95]).loc['95%']
10.0
df["totviz"] = np.where(df["totviz"] <p5, p5,df['totviz'])
df["totviz"] = np.where(df["totviz"] >p95, p95,df['totviz'])
df["totviz"] <p5
Prospect ID
7927b2df-8bba-4d29-b9a2-b6e0beafe620    False
2a272436-5132-4136-86fa-dcc88c88f482    False
8cc8c611-a219-4f35-ad23-fdfd2656bd8a    False
0cc2df48-7cf4-4e39-9de9-19797f9b38cc    False
3256f628-e534-4826-9d63-4a8b88782852    False
                                        ...
19d6451e-fcd6-407c-b83b-48e1af805ea9    False
82a7005b-7196-4d56-95ce-a79f937a158d    False
aac550fe-a586-452d-8d3c-f1b62c94e02c    False
5330a7d1-2f2b-4df4-85d6-64ca2f6b95b9    False
571b5c8e-a5b2-4d57-8574-f2ffb06fdeff    False
Name: totviz, Length: 9240, dtype: bool
sns.catplot(data=df, x='totviz', kind= 'box', orient='h', aspect=3)
<seaborn.axisgrid.FacetGrid at 0x7f5239490e90>
df.columns
Index(['prospsid', 'leadnum', 'origin', 'dne', 'dnc', 'target', 'totviz', 'tos', 'pageviz', 'lastact', 'country', 'splz', 'leadsrc', 'curoccp', 'reason', 'srch', 'mag', 'newsart', 'edforum', 'newsad', 'digitalad', 'reco', 'updates', 'tag', 'leadquality', 'supchncont', 'dmcontent', 'leadprofile', 'city', 'asymactidx', 'asymprofidx', 'asymactscr', 'asymprofscr', 'chkpay', 'freecp', 'lastnotact'], dtype='object')
cat_vars
['leadnum', 'origin', 'dne', 'dnc', 'lastact', 'country', 'splz', 'leadsrc', 'curoccp', 'reason', 'srch', 'mag', 'newsart', 'edforum', 'newsad', 'digitalad', 'reco', 'updates', 'tag', 'leadquality', 'supchncont', 'dmcontent', 'leadprofile', 'city', 'asymactidx', 'asymprofidx', 'chkpay', 'freecp', 'lastnotact']
null_cat_vars = df.loc[:, cat_vars].isnull().sum()[df.loc[:, cat_vars].isnull().sum() > 0].index.tolist()
null_cat_vars
['origin', 'lastact', 'country', 'splz', 'leadsrc', 'curoccp', 'reason', 'tag', 'leadquality', 'leadprofile', 'city', 'asymactidx', 'asymprofidx']
100 * df.loc[:, null_cat_vars].isnull().sum() / len(df)
origin          0.389610
lastact         1.114719
country        26.634199
splz           15.562771
leadsrc        23.885281
curoccp        29.112554
reason         29.318182
tag            36.287879
leadquality    51.590909
leadprofile    29.318182
city           15.367965
asymactidx     45.649351
asymprofidx    45.649351
dtype: float64
df.origin.describe()
count       9204
unique        21
top       Google
freq        2868
Name: origin, dtype: object
df.origin.mode()
0 Google dtype: object
df.origin.value_counts(normalize=False, dropna=True)
Google               2868
Direct Traffic       2543
Olark Chat           1755
Organic Search       1154
Reference             534
Welingak Website      142
Referral Sites        125
Facebook               55
bing                    6
google                  5
Click2call              4
Press_Release           2
Social Media            2
Live Chat               2
NC_EDM                  1
welearnblog_Home        1
blog                    1
Pay per Click Ads       1
testone                 1
youtubechannel          1
WeLearn                 1
Name: origin, dtype: int64
df['origin'] = df['origin'].fillna(df['origin'].mode()[0])  # mode() returns a Series, so take its first element
# categorical columns where more than 30% of values are null -- too sparse to impute reliably
hi_nulls = df.loc[:, null_cat_vars].isnull().sum()[100*df.loc[:, null_cat_vars].isnull().sum()/len(df) > 30].index.tolist()
hi_nulls
['tag', 'leadquality', 'asymactidx', 'asymprofidx']
df.drop(hi_nulls, axis=1, inplace= True)
cat_vars = list(set(cat_vars)- set(hi_nulls))
cat_vars
['origin', 'updates', 'mag', 'curoccp', 'edforum', 'reco', 'srch', 'lastnotact', 'reason', 'splz', 'country', 'lastact', 'chkpay', 'city', 'newsart', 'dnc', 'digitalad', 'supchncont', 'newsad', 'leadnum', 'leadsrc', 'dmcontent', 'freecp', 'dne', 'leadprofile']
df.target.value_counts()
0    5679
1    3561
Name: target, dtype: int64
df.loc[:,cat_vars].isnull().sum()
origin           36
updates           0
mag               0
curoccp        2690
edforum           0
reco              0
srch              0
lastnotact        0
reason         2709
splz           1438
country        2461
lastact         103
chkpay            0
city           1420
newsart           0
dnc               0
digitalad         0
supchncont        0
newsad            0
leadnum           0
leadsrc        2207
dmcontent         0
freecp            0
dne               0
leadprofile    2709
dtype: int64
sns.catplot(x="target", col="city", col_wrap=4,
data=df.fillna('NA'),
kind="count", height=3, aspect=.8)
<seaborn.axisgrid.FacetGrid at 0x7f5239329350>
df['city'] = df['city'].fillna('NA')  # keep missing city as its own explicit category
df.origin.value_counts(normalize=True)
Google               0.311604
Direct Traffic       0.276293
Olark Chat           0.190678
Organic Search       0.125380
Reference            0.058018
Welingak Website     0.015428
Referral Sites       0.013581
Facebook             0.005976
bing                 0.000652
google               0.000543
Click2call           0.000435
Press_Release        0.000217
Social Media         0.000217
Live Chat            0.000217
NC_EDM               0.000109
welearnblog_Home     0.000109
blog                 0.000109
Pay per Click Ads    0.000109
testone              0.000109
youtubechannel       0.000109
WeLearn              0.000109
Name: origin, dtype: float64
df['origin'] = df.origin.str.lower()  # lower-case so variants like 'Google' and 'google' merge
df.origin.value_counts(normalize=True)[df.origin.value_counts(normalize=True) < .1].index
Index(['reference', 'welingak website', 'referral sites', 'facebook', 'bing', 'click2call', 'live chat', 'press_release', 'social media', 'welearnblog_home', 'blog', 'testone', 'youtubechannel', 'nc_edm', 'pay per click ads', 'welearn'], dtype='object')
smalls = df.origin.value_counts(normalize=True)[df.origin.value_counts(normalize=True) < .1].index.tolist()
# map every rare category to the single label 'others'
small_map = {s: 'others' for s in smalls}
small_map
{'bing': 'others', 'blog': 'others', 'click2call': 'others', 'facebook': 'others', 'live chat': 'others', 'nc_edm': 'others', 'pay per click ads': 'others', 'press_release': 'others', 'reference': 'others', 'referral sites': 'others', 'social media': 'others', 'testone': 'others', 'welearn': 'others', 'welearnblog_home': 'others', 'welingak website': 'others', 'youtubechannel': 'others'}
df['origin'] = df.origin.map(small_map).fillna(df.origin)  # rare levels become 'others'; all other values pass through
df.origin.value_counts()
google 2873 direct traffic 2543 olark chat 1755 organic search 1154 others 879 Name: origin, dtype: int64
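If several categorical columns need the same treatment, the lumping logic can be wrapped in a small helper (lump_rare is our sketch, not a pandas function; the 10% threshold matches the one used above):

# hypothetical helper: relabel categories rarer than min_share as 'others'
def lump_rare(s, min_share=0.1, other='others'):
    share = s.value_counts(normalize=True)
    rare = share[share < min_share].index
    return s.where(~s.isin(rare), other)

# usage sketch: df['origin'] = lump_rare(df['origin'].str.lower())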
sns.catplot(x="target", col="origin", col_wrap=2,
data=df.fillna('NA'),
kind="count", height=5, aspect=1)
<seaborn.axisgrid.FacetGrid at 0x7f5238f04a50>
The following references are offered to the class as they haven't been covered in class, due to the data set or their advanced nature.
You may want to take a look and see if you need them for your Term Project.